Loading Dataset¶

In [51]:
import pandas as pd

# Source: CT slice localization dataset (Colab local path).
file_path = '/content/slice_localization_data.csv'
df = pd.read_csv(file_path)

EDA¶

In [52]:
# Preview the first rows. A bare trailing expression renders the rich
# HTML table in a notebook, instead of the plain-text repr print() gives.
df.head()
   patientId  value0  value1  value2  value3  value4  value5  value6  value7  \
0          0     0.0     0.0     0.0     0.0     0.0     0.0   -0.25   -0.25   
1          0     0.0     0.0     0.0     0.0     0.0     0.0   -0.25   -0.25   
2          0     0.0     0.0     0.0     0.0     0.0     0.0   -0.25   -0.25   
3          0     0.0     0.0     0.0     0.0     0.0     0.0   -0.25   -0.25   
4          0     0.0     0.0     0.0     0.0     0.0     0.0   -0.25   -0.25   

   value8  ...  value375  value376  value377  value378  value379  value380  \
0   -0.25  ...     -0.25  0.980381       0.0       0.0       0.0       0.0   
1   -0.25  ...     -0.25  0.977008       0.0       0.0       0.0       0.0   
2   -0.25  ...     -0.25  0.977008       0.0       0.0       0.0       0.0   
3   -0.25  ...     -0.25  0.977008       0.0       0.0       0.0       0.0   
4   -0.25  ...     -0.25  0.976833       0.0       0.0       0.0       0.0   

   value381  value382  value383  reference  
0       0.0     -0.25     -0.25  21.803851  
1       0.0     -0.25     -0.25  21.745726  
2       0.0     -0.25     -0.25  21.687600  
3       0.0     -0.25     -0.25  21.629474  
4       0.0     -0.25     -0.25  21.571348  

[5 rows x 386 columns]
In [53]:
# Rows x columns of the raw frame.
dataset_shape = df.shape
print(f"Dataset shape: {dataset_shape}")
Dataset shape: (53500, 386)
In [54]:
# Partition columns by dtype to check whether any categorical encoding is needed.
categorical_columns = df.select_dtypes(include='object').columns
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns

categorical_count, numerical_count = len(categorical_columns), len(numerical_columns)

print("Categorical columns:", categorical_columns)
print("Numerical columns:", numerical_columns)
print("Number of categorical columns:", categorical_count)
print("Number of numerical columns:", numerical_count)
Categorical columns: Index([], dtype='object')
Numerical columns: Index(['patientId', 'value0', 'value1', 'value2', 'value3', 'value4', 'value5',
       'value6', 'value7', 'value8',
       ...
       'value375', 'value376', 'value377', 'value378', 'value379', 'value380',
       'value381', 'value382', 'value383', 'reference'],
      dtype='object', length=386)
Number of categorical columns: 0
Number of numerical columns: 386
In [55]:
# Count nulls per column and report only the columns that have any.
null_counts = df.isnull().sum()
print("Columns with null values:")
for column in null_counts[null_counts > 0].index:
    print(f"{column}: {null_counts[column]} null values")
Columns with null values:
In [56]:
# Impute any missing values with each column's mode (first mode on ties).
df_filled = df.fillna(df.mode().iloc[0])

# Verify the imputation left no nulls behind.
null_counts_filled = df_filled.isnull().sum()

if not null_counts_filled.any():
    print("No null values found.")
else:
    print("Null values:")
    print(null_counts_filled)
No null values found.

From the above dataset, we can see that the variables named ‘value0’, ‘value1’, …, ‘value383’ contain the feature values of CT scan images for each patient. The last variable, ‘reference’, is our target variable: it contains the relative location of the CT slice.

In [57]:
# Summary statistics (count/mean/std/quartiles) for all 386 numeric columns.
df_filled.describe(include='all')
Out[57]:
patientId value0 value1 value2 value3 value4 value5 value6 value7 value8 ... value375 value376 value377 value378 value379 value380 value381 value382 value383 reference
count 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 ... 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000
mean 47.075701 0.059627 0.071558 0.145819 0.218728 0.274762 0.276189 0.204531 0.062281 -0.042025 ... -0.029404 0.182913 0.320112 0.359373 0.342889 0.266091 0.083049 -0.031146 -0.154524 47.028039
std 27.414240 0.174243 0.196921 0.300270 0.359163 0.378862 0.369605 0.351294 0.292232 0.268391 ... 0.085817 0.383333 0.463517 0.478188 0.471811 0.437633 0.279734 0.098738 0.122491 22.347042
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -0.250000 -0.250000 -0.250000 -0.250000 ... -0.250000 0.000000 0.000000 0.000000 0.000000 0.000000 -0.250000 -0.250000 -0.250000 1.738733
25% 23.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -0.250000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -0.250000 29.891607
50% 46.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -0.250000 43.987893
75% 70.000000 0.000000 0.000000 0.000000 0.446429 0.684477 0.662382 0.441412 0.000000 0.000000 ... 0.000000 0.000000 0.996286 0.999677 0.999560 0.949478 0.000000 0.000000 0.000000 63.735059
max 96.000000 1.000000 1.000000 1.000000 1.000000 0.998790 0.996468 0.999334 1.000000 1.000000 ... 0.961279 1.000000 1.000000 1.000000 1.000000 1.000000 0.999857 0.996839 0.942851 97.489115

8 rows × 386 columns

In [58]:
# Getting unique values of the "patientId" column (output shows 97 distinct
# patients, 0-96).
# NOTE(review): numpy is imported mid-notebook here; conventionally all
# imports belong in the first code cell.
import numpy as np

np.unique(df_filled["patientId"])
Out[58]:
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96])
In [59]:
# List every column name to confirm the feature/target layout.
print(df_filled.columns)
Index(['patientId', 'value0', 'value1', 'value2', 'value3', 'value4', 'value5',
       'value6', 'value7', 'value8',
       ...
       'value375', 'value376', 'value377', 'value378', 'value379', 'value380',
       'value381', 'value382', 'value383', 'reference'],
      dtype='object', length=386)
In [60]:
# Drop the identifier column 'patientId', then separate the target
# ('reference', the relative slice location) from the 384 feature columns.
df_copy = df_filled.drop(columns=['patientId'])
df_y = df_copy['reference']
df_x = df_copy.drop(columns=['reference'])
In [61]:
# Plot "Reference" column distplot
plt.figure(figsize=(12,8))
sns.distplot(df_y, bins=100)
<ipython-input-61-1c272ca86b41>:3: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(df_y, bins=100)
Out[61]:
<Axes: xlabel='reference', ylabel='Density'>

Split the Data

In [62]:
# Split the data into training and testing sets (70/30, seeded for
# reproducibility).
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df_x, df_y, test_size=0.3, random_state=42
)

# Peek at each split to sanity-check the partition.
for label, part in [("X_train", X_train), ("\nX_test", X_test),
                    ("\ny_train", y_train), ("\ny_test", y_test)]:
    print(f"{label}:")
    print(part.head())
X_train:
         value0    value1    value2    value3    value4    value5    value6  \
19113  0.000000  0.000000  0.000000  0.000000  0.939018  0.965932  0.873580   
40279  0.114286  0.020930  0.000000  0.963090  0.680756  0.558228  0.439762   
42189  0.000000  0.000000  0.000000  0.751979  0.000000  0.842081  0.904479   
30994  0.000000  0.000000  0.914286  0.855310  0.893836  0.000000  0.000000   
19373  0.000000  0.267206  0.904605  0.972478  0.856772  0.000000  0.000000   

         value7    value8  value9  ...  value374  value375  value376  \
19113  0.000000  0.000000   -0.25  ...       0.0       0.0  0.000000   
40279  0.898336  0.741252    0.00  ...       0.0       0.0  0.000000   
42189  0.943744  0.699692    0.00  ...       0.0       0.0  0.994347   
30994  0.000000  0.000000   -0.25  ...       0.0       0.0  0.000000   
19373  0.000000  0.000000   -0.25  ...       0.0       0.0  0.000000   

       value377  value378  value379  value380  value381  value382  value383  
19113  0.000000  0.000000  0.000000       0.0       0.0       0.0       0.0  
40279  0.000000  0.000000  0.000000       0.0       0.0       0.0       0.0  
42189  0.999256  0.999908  0.997149       0.0       0.0       0.0       0.0  
30994  0.000000  0.000000  0.000000       0.0       0.0       0.0       0.0  
19373  0.000000  0.000000  0.000000       0.0       0.0       0.0       0.0  

[5 rows x 384 columns]

X_test:
        value0    value1    value2    value3    value4    value5  value6  \
46266  0.00000  0.000000  0.000000  0.000000  0.000000  0.704285     0.0   
2778   0.86783  0.898305  0.169761  0.411677  0.950726  0.000000     0.0   
34408  0.00000  0.000000  0.000000  0.000000  0.000000  0.000000     0.0   
13871  0.00000  0.000000  0.000000  0.000000  0.797264  0.390216     0.0   
35994  0.00000  0.000000  0.000000  0.000000  0.000000  0.000000     0.0   

       value7  value8  value9  ...  value374  value375  value376  value377  \
46266    0.00   -0.25   -0.25  ...       0.0       0.0       0.0  0.797007   
2778    -0.25   -0.25   -0.25  ...       0.0       0.0       0.0  0.000000   
34408    0.00   -0.25   -0.25  ...       0.0       0.0       0.0  0.999131   
13871    0.00    0.00   -0.25  ...       0.0       0.0       0.0  0.000000   
35994    0.00   -0.25   -0.25  ...       0.0       0.0       0.0  0.000000   

       value378  value379  value380  value381  value382  value383  
46266  0.999385  0.999884  0.999754       0.0      0.00     -0.25  
2778   0.988959  0.985753  0.000000       0.0     -0.25     -0.25  
34408  0.999918  0.999959  0.999382       0.0      0.00     -0.25  
13871  0.000000  0.000000  0.000000       0.0      0.00     -0.25  
35994  0.000000  0.000000  0.000000       0.0      0.00     -0.25  

[5 rows x 384 columns]

y_train:
19113    91.005317
40279    26.270760
42189    33.807090
30994    79.645955
19373    81.673863
Name: reference, dtype: float64

y_test:
46266    43.349961
2778     12.119149
34408    39.070125
13871    88.928088
35994    60.022506
Name: reference, dtype: float64

Apply Linear Regression¶

In [63]:
from sklearn.linear_model import LinearRegression

# Baseline model: ordinary least squares on all 384 raw feature columns.
lm = LinearRegression()
lm.fit(X_train, y_train)
Out[63]:
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [64]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Evaluate the baseline model on the held-out test set.
y_pred = lm.predict(X_test)

mse, mae, r2 = (mean_squared_error(y_test, y_pred),
                mean_absolute_error(y_test, y_pred),
                r2_score(y_test, y_pred))

print('Mean Squared Error:', mse)
print('Mean Absolute Error:', mae)
print('R-squared:', r2)
Mean Squared Error: 68.29434562989341
Mean Absolute Error: 6.123343272084307
R-squared: 0.8624473194580933
In [65]:
# Evaluate on the training set; comparing with the test metrics above
# gauges overfitting.
# Fix: use a distinct name (y_train_pred, matching the later ridge/EN
# cells) instead of silently overwriting the test-set y_pred.
y_train_pred = lm.predict(X_train)

mse = mean_squared_error(y_train, y_train_pred)
mae = mean_absolute_error(y_train, y_train_pred)
r2 = r2_score(y_train, y_train_pred)

print('Mean Squared Error:', mse)
print('Mean Absolute Error:', mae)
print('R-squared:', r2)
Mean Squared Error: 67.82275427597651
Mean Absolute Error: 6.10251339719793
R-squared: 0.8645188670291543
In [66]:
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


def plotLearningCurves(X, y, step):
    """Plot train vs. test MSE as a function of training-set size.

    Parameters
    ----------
    X, y : positionally sliceable training features / target.
    step : int, increment of the training-set size between curve points.

    NOTE(review): test error is measured against the notebook-global
    X_test / y_test, as in the original.
    """
    m = X.shape[0]
    maxVal = (m // 10) * 10
    N_size_arr = np.arange(10, maxVal + 10, step)
    error_arr = np.zeros((len(N_size_arr), 2))

    # Use a local model so the fitted global `lm` (trained on the full
    # training set in an earlier cell) is left untouched.
    model = LinearRegression()

    for index, n in enumerate(N_size_arr):
        # Bug fix: the model must be REFIT on each subset of size n.
        # The original fit once on the full data before the loop, so the
        # plotted "curve" never actually depended on the training size.
        # It also sliced the globals X_train/y_train instead of the X/y
        # parameters; use the parameters consistently.
        X_sub, y_sub = X[:n], y[:n]
        model.fit(X_sub, y_sub)

        error_arr[index, 0] = mean_squared_error(y_sub, model.predict(X_sub))
        error_arr[index, 1] = mean_squared_error(y_test, model.predict(X_test))

    # Log-scaled MSE vs. training-set size for both curves.
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_axes([0, 0, 1, 1])
    ax.set_yscale('log')

    line1, = ax.plot(N_size_arr, error_arr[:, 0], c='red')
    line2, = ax.plot(N_size_arr, error_arr[:, 1], c='blue')

    ax.set_xlabel("N (Training set size)")
    ax.set_ylabel("Mean Squared Error")

    ax.legend((line1, line2), ("Train Error", "Test Error"))

# Call the function to plot the learning curves
plotLearningCurves(X_train, y_train, 200)
In [67]:
# Predicting reference values with the test dataset
y_pred = lm.predict(X_test)

# Scatter true test targets against predictions; a good fit hugs the diagonal.
fig = plt.figure(figsize=(10, 6))
ax = fig.add_axes([0, 0, 1, 1])

ax.set_xlabel("Predictions")
ax.set_ylabel("Test Target Variable")
ax.plot(y_test, y_pred, 'bo', ms=1)

plt.show()

Based on the above information, it appears that the model might be slightly overfitting. Overfitting occurs when a model learns the training data too well and performs poorly on unseen data.

The mean squared error (MSE) and mean absolute error (MAE) on the training dataset are slightly lower than on the test dataset. Additionally, the R-squared value on the training dataset is higher than on the test dataset.

High-complexity model: Overfitting can occur when the model is too complex relative to the available data. With a large number of features (384) compared to the number of instances (53,500), it is possible that the model has learned noise or irrelevant patterns in the training data.

PCA (Principal Component Analysis) process¶

Feature Standardization

  1. perform the PCA to obtain the principal components and their corresponding explained variance
In [68]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize the features first (PCA is scale-sensitive), then keep the
# smallest number of components that explains 75% of the variance.
scaler = StandardScaler()
scaled_df_x = scaler.fit_transform(df_x)

pca = PCA(0.75)
pca_vectors = pca.fit_transform(scaled_df_x)

for index, var in enumerate(pca.explained_variance_ratio_, start=1):
    print("Explained Variance ratio by Principal Component ", index, " : ", var)
Explained Variance ratio by Principal Component  1  :  0.14855715257210392
Explained Variance ratio by Principal Component  2  :  0.12108312979894122
Explained Variance ratio by Principal Component  3  :  0.06474442177698231
Explained Variance ratio by Principal Component  4  :  0.03774984839961066
Explained Variance ratio by Principal Component  5  :  0.03510333274248252
Explained Variance ratio by Principal Component  6  :  0.025645225528655405
Explained Variance ratio by Principal Component  7  :  0.02330954175071417
Explained Variance ratio by Principal Component  8  :  0.021428275145390387
Explained Variance ratio by Principal Component  9  :  0.017614067903705406
Explained Variance ratio by Principal Component  10  :  0.01589391279292687
Explained Variance ratio by Principal Component  11  :  0.014032726489128131
Explained Variance ratio by Principal Component  12  :  0.012871921022909665
Explained Variance ratio by Principal Component  13  :  0.0121634919283602
Explained Variance ratio by Principal Component  14  :  0.01072092397679328
Explained Variance ratio by Principal Component  15  :  0.009760345345618412
Explained Variance ratio by Principal Component  16  :  0.009482151161428901
Explained Variance ratio by Principal Component  17  :  0.008695231697355841
Explained Variance ratio by Principal Component  18  :  0.008205878792193493
Explained Variance ratio by Principal Component  19  :  0.008041977514589788
Explained Variance ratio by Principal Component  20  :  0.007543652519555233
Explained Variance ratio by Principal Component  21  :  0.006931981984392693
Explained Variance ratio by Principal Component  22  :  0.00646707766389355
Explained Variance ratio by Principal Component  23  :  0.006083596311372858
Explained Variance ratio by Principal Component  24  :  0.005868832669382626
Explained Variance ratio by Principal Component  25  :  0.005741237892731493
Explained Variance ratio by Principal Component  26  :  0.005545363955500639
Explained Variance ratio by Principal Component  27  :  0.0053812466975905315
Explained Variance ratio by Principal Component  28  :  0.005160938036779552
Explained Variance ratio by Principal Component  29  :  0.005084092580391446
Explained Variance ratio by Principal Component  30  :  0.004799621223915218
Explained Variance ratio by Principal Component  31  :  0.004652832065005538
Explained Variance ratio by Principal Component  32  :  0.004574687218755086
Explained Variance ratio by Principal Component  33  :  0.004478718115698722
Explained Variance ratio by Principal Component  34  :  0.004312607244892595
Explained Variance ratio by Principal Component  35  :  0.004197561850896129
Explained Variance ratio by Principal Component  36  :  0.004096225788266026
Explained Variance ratio by Principal Component  37  :  0.004050004696179025
Explained Variance ratio by Principal Component  38  :  0.0040203530745072285
Explained Variance ratio by Principal Component  39  :  0.0039032597942033425
Explained Variance ratio by Principal Component  40  :  0.0037259511080387207
Explained Variance ratio by Principal Component  41  :  0.0036811022474497667
Explained Variance ratio by Principal Component  42  :  0.003532807590921863
Explained Variance ratio by Principal Component  43  :  0.0035221449980117033
Explained Variance ratio by Principal Component  44  :  0.0033683692074793284
Explained Variance ratio by Principal Component  45  :  0.003324213440610176
Explained Variance ratio by Principal Component  46  :  0.0032417350414558144
Explained Variance ratio by Principal Component  47  :  0.003097464371346726
Explained Variance ratio by Principal Component  48  :  0.0030493180022711022
Explained Variance ratio by Principal Component  49  :  0.0030174010370553972
Explained Variance ratio by Principal Component  50  :  0.002893053408362604
Explained Variance ratio by Principal Component  51  :  0.002831606959181239
Explained Variance ratio by Principal Component  52  :  0.002775687439238411

We are able to reduce the dimensionality from 384 features to 52 principal components while retaining 75% of the variance.

  2. Plot the explained variance ratio and the cumulative explained variance to justify the chosen number of components
In [69]:
import numpy as np
import matplotlib.pyplot as plt

# Per-component explained variance and its running total.
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance_ratio)

component_ids = range(1, len(explained_variance_ratio) + 1)

# Both curves on one axis: the elbow of the first and the 0.75 crossing
# of the second justify the component count.
plt.figure(figsize=(10, 6))
plt.plot(component_ids, explained_variance_ratio, marker='o', label='Explained Variance Ratio')
plt.plot(component_ids, cumulative_variance, marker='o', label='Cumulative Explained Variance')
plt.xlabel('Number of Principal Components')
plt.ylabel('Explained Variance Ratio / Cumulative Explained Variance')
plt.title('Explained Variance Ratio and Cumulative Explained Variance')
plt.legend()
plt.show()
In [70]:
# Project the standardized features onto the principal components.
# Fix: `pca` was already fitted two cells above, so transform() suffices;
# fit_transform() here needlessly refit the model on the same data.
pca_vectors = pca.transform(scaled_df_x)

# Access and print the pca_vectors
print("PCA Vectors:")
print(pca_vectors)
PCA Vectors:
[[ 1.66418097e+01 -5.28637854e+00  5.90813596e+00 ...  1.47466546e+00
  -6.07350165e-03  7.87119195e-01]
 [ 1.65943085e+01 -4.94762736e+00  6.12864705e+00 ...  1.18524823e+00
  -1.03928883e-01  1.02606483e+00]
 [ 1.65927230e+01 -4.91301981e+00  6.17570643e+00 ...  1.15494987e+00
  -1.00923531e-01  8.95570094e-01]
 ...
 [-4.79214559e+00  1.42699675e+01  5.57219567e-01 ... -7.52523437e-01
   1.66459189e-01  7.02028650e-02]
 [ 1.73465338e+01 -3.01729764e+00  6.15848484e+00 ... -5.72477971e-01
  -1.88995484e-01  1.73028394e+00]
 [ 1.75212840e+01 -2.49155501e+00  5.31833961e+00 ... -4.57566760e-01
  -3.26204443e-01  1.49666375e+00]]
In [71]:
import seaborn as sns

# Pairwise scatter of the first 20 principal components.
# NOTE: this renders a 20x20 panel grid, which is slow for 53,500 rows.
subset_pca = pca_vectors[:, :20]  # Adjust the number of components as desired

component_names = [f'Principal Component {i+1}' for i in range(subset_pca.shape[1])]
df_subset_pca = pd.DataFrame(subset_pca, columns=component_names)

sns.set(style='ticks')
sns.pairplot(df_subset_pca)
plt.suptitle('Scatter Plot Matrix of Subset Principal Components')
plt.show()
In [74]:
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="ticks")
sns.set_palette(palette='Set1')

fig_1 = plt.figure(figsize=(18,6))

sns.set(style="ticks")
sns.set_palette(palette='Set1')

sns.regplot(x=pca_vectors[:,0],y=df_y, label='Principal Component 1',x_bins=10)
sns.regplot(x=pca_vectors[:,1],y=df_y, label='Principal Component 2',x_bins=10)
sns.regplot(x=pca_vectors[:,2],y=df_y, label='Principal Component 3',x_bins=10)

plt.title('Most Important Principal Components vs Reference Value')
plt.xlabel('Principal Component Value')
plt.ylabel('Reference Value')
plt.legend()
plt.show()
In [75]:
import matplotlib.pyplot as plt

# Same view for the three weakest retained components; their relationship
# with the target is expected to be much flatter.
fig_2 = plt.figure(figsize=(18, 6))

for col, label in [(51, 'Principal Component 52'),
                   (50, 'Principal Component 51'),
                   (49, 'Principal Component 50')]:
    sns.regplot(x=pca_vectors[:, col], y=df_y, label=label, x_bins=10)

plt.title('Least Important Principal Components vs Reference Value')
plt.xlabel('Principal Component Value')
plt.ylabel('Reference Value')
plt.legend()
plt.show()

Ridge Regression¶

In [76]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Split the PCA-reduced features into train/test sets.
# Fix: pin random_state so the split is reproducible across runs.
# NOTE(review): the ridge/ElasticNet cells below fit on X_train / X_test
# (the raw 384-feature split), not on train_x / test_x — this PCA split
# appears unused downstream; confirm which was intended.
train_x, test_x, train_y, test_y = train_test_split(
    pca_vectors, df_y, random_state=42
)
In [77]:
from sklearn.linear_model import Ridge

# Create a Ridge regression model
ridge = Ridge(alpha=0.3)  # You can adjust the regularization strength with the alpha parameter

# Fit the model on the training data
# NOTE(review): this fits on the raw-feature split (X_train), not on the
# PCA split (train_x) created in the previous cell — confirm intent.
ridge.fit(X_train, y_train)
Out[77]:
Ridge(alpha=0.3)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Ridge(alpha=0.3)
In [78]:
# Test-set performance of the ridge model.
y_pred = ridge.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

for label, value in [('Mean Squared Error:', mse),
                     ('Mean Absolute Error:', mae),
                     ('R-squared:', r2)]:
    print(label, value)
Mean Squared Error: 68.23671966387057
Mean Absolute Error: 6.119286445447472
R-squared: 0.8625633847929632
In [79]:
# Training-set performance of the ridge model, for an overfitting check.
y_train_pred = ridge.predict(X_train)

train_mse = mean_squared_error(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)

print('Training Set Metrics:')
for label, value in [('Mean Squared Error:', train_mse),
                     ('Mean Absolute Error:', train_mae),
                     ('R-squared:', train_r2)]:
    print(label, value)
Training Set Metrics:
Mean Squared Error: 67.79810610837276
Mean Absolute Error: 6.101670361728603
R-squared: 0.8645681036269341
In [80]:
# Predicting reference values with the test dataset
y_pred = ridge.predict(X_test)

# True test targets vs. ridge predictions.
fig = plt.figure(figsize=(12, 8))
ax = fig.add_axes([0, 0, 1, 1])

ax.set_xlabel("Predictions")
ax.set_ylabel("Test Target Variable")
ax.plot(y_test, y_pred, 'bo', ms=1)

plt.show()

ElasticNetCV¶

In [81]:
# Fit an Elastic Net regression model, selecting alpha by 5-fold CV
# over a small grid (default l1_ratio).
from sklearn.linear_model import ElasticNetCV
# NOTE(review): like the ridge model, this fits on the raw-feature split
# (X_train), not the PCA split train_x — confirm intent.
regr_en = ElasticNetCV(cv=5, alphas=[0.1, 0.3, 0.5, 0.7, 1.0])
regr_en.fit(X_train, y_train)
Out[81]:
ElasticNetCV(alphas=[0.1, 0.3, 0.5, 0.7, 1.0], cv=5)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
ElasticNetCV(alphas=[0.1, 0.3, 0.5, 0.7, 1.0], cv=5)
In [82]:
# Test-set performance of the Elastic Net model.
y_pred = regr_en.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

for label, value in [('Mean Squared Error:', mse),
                     ('Mean Absolute Error:', mae),
                     ('R-squared:', r2)]:
    print(label, value)

# Iterate and fine-tune the model as needed by adjusting PCA components,
# alpha, l1_ratio, or other parameters, re-evaluating until results are
# satisfactory.
Mean Squared Error: 88.49517192933334
Mean Absolute Error: 7.003583587739677
R-squared: 0.8217605278793605
In [83]:
# Training-set performance of the Elastic Net model, for an overfitting check.
y_train_pred = regr_en.predict(X_train)

train_mse = mean_squared_error(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)

print('Training Set Metrics:')
for label, value in [('Mean Squared Error:', train_mse),
                     ('Mean Absolute Error:', train_mae),
                     ('R-squared:', train_r2)]:
    print(label, value)
Training Set Metrics:
Mean Squared Error: 90.0002231532398
Mean Absolute Error: 7.057476007992971
R-squared: 0.8202176787039026
In [84]:
# Predicting reference values with the test dataset
y_pred = regr_en.predict(X_test)

# True test targets vs. Elastic Net predictions.
fig = plt.figure(figsize=(12, 8))
ax = fig.add_axes([0, 0, 1, 1])

ax.set_xlabel("Predictions")
ax.set_ylabel("Test Target Variable")
ax.plot(y_test, y_pred, 'bo', ms=1)

plt.show()